###########################################
### Article: Where do parties interact? ###
### Task:    Analysis - Preparation     ###
### Title:   02_Analysis_Preparation.R  ###
###########################################

#--------------------------------------------------------------------------------------------------
# Description:
#
# This script loads the classified textual data (party press releases and tweets) based on the
# Structural Topic Model (STM) and prepares the main data set for the (statistical) analysis.
#--------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Remember when the script started; the elapsed run time is printed on the
# last line of the script (Sys.time() - start).
start <- Sys.time()

# Load packages

library(tidyverse)
library(dplyr)
library(xlsx)
library(data.table)
library(manifestoR)

#---------------------------------------------------------------------------------------------------------------------

# Load data

# Read the STM-classified documents and keep only those with a usable topic:
# documents without a topic label (missing or the literal string "NA") and
# documents assigned to one of the excluded topic labels are removed.
excluded_topics <- c("NA",
                     "Event information",
                     "Thanks and wishes",
                     "Legislation and initiatives")

data <- readRDS("STM_Result.RDS")
data <- filter(data,
               !is.na(topic_label),
               !(topic_label %in% excluded_topics))

# Country codes are used as join keys later on, so store them as plain strings
data$country <- as.character(data$country)

#---------------------------------------------------------------------------------------------------------------------

# Press releases

## filter for press releases
data_pr <- filter(data, type == "Press release")

## party dyads per day --> check existence of parties (e.g. PILZ/JETZT, BDP/PBD)

# One row per country-party combination found in the classified documents
# (press releases and tweets together). Computed once instead of on every
# iteration.
party_list <- unique(data[, c("country", "party")])

# Every day of the observation period
dyad_days <- seq.Date(from = as.Date("2019-01-01"),
                      to = as.Date("2021-09-26"),
                      by = "day")

# For each party, build the daily grid of directed dyads (party <- source)
# with every other party of the same country.
# The iteration runs over the rows of party_list; counting distinct party
# labels instead would skip combinations whenever a label occurs in more than
# one country.
dyads_by_party <- lapply(seq_len(nrow(party_list)), function(i) {
  tmp_party <- as.character(party_list[["party"]][i])
  tmp_country <- as.character(party_list[["country"]][i])

  other_parties <- filter(data,
                          party != tmp_party &
                            country == tmp_country) %>%
    select(party) %>%
    unique()

  # A party without any counterpart in its country forms no dyad
  if (nrow(other_parties) == 0) {
    return(NULL)
  }

  tmp_res <- data.frame(country = tmp_country,
                        date = dyad_days,
                        party = tmp_party) %>%
    right_join(data.frame(party = tmp_party, source = other_parties$party),
               by = "party")

  # Directed label ("party-source") and an undirected label in which the two
  # party names are put in sorted order
  tmp_res$party_dyad <- paste0(tmp_res$party, "-", tmp_res$source)
  sorted_pairs <- lapply(str_split(tmp_res$party_dyad, pattern = "-"), sort)
  tmp_res$party_dyad_sorted <- vapply(
    sorted_pairs,
    function(pair) paste0(pair[1], "-", pair[2]),
    character(1)
  )

  tmp_res
})

# Stack the per-party grids in one step (NULL entries are ignored)
df_dyads_pr <- bind_rows(dyads_by_party)

## party dyads per day and issue
# Repeat the complete dyad-day grid once per issue: the grid is stacked
# length(issue_labels) times and each copy is labelled with one issue.
# The repetition is written out explicitly instead of relying on cbind()
# recycling the data frame against a longer vector.
issue_labels <- sort(unique(data$topic_label))
n_dyad_days <- nrow(df_dyads_pr)
df_dyads_pr <- df_dyads_pr[rep(seq_len(n_dyad_days),
                               times = length(issue_labels)), ]
df_dyads_pr$issue <- rep(issue_labels, each = n_dyad_days)
rownames(df_dyads_pr) <- NULL

## filter Grüne/GA, PILZ/JETZT, Mitte, BDP/PBD and CVP/PDC
## (only partial representation in parliament)

# TRUE for every actor/day combination that has to be removed from the grid
outside_parliament <- function(actor, day) {
  (actor == "Grüne/GA" & day <= as.Date("2019-09-29")) |
    (actor == "PILZ/JETZT" & day > as.Date("2019-09-29")) |
    (actor == "Mitte" & day < as.Date("2021-01-01")) |
    (actor == "BDP/PBD" & day >= as.Date("2021-01-01")) |
    (actor == "CVP/PDC" & day >= as.Date("2021-01-01"))
}

# A dyad-day is dropped when either side of the dyad is affected
drop_rows <- outside_parliament(df_dyads_pr$party, df_dyads_pr$date) |
  outside_parliament(df_dyads_pr$source, df_dyads_pr$date)

# Logical subsetting keeps all rows when nothing matches; negative indexing
# with an empty index vector would return zero rows instead.
df_dyads_pr <- df_dyads_pr[!(drop_rows %in% TRUE), ]

## calculate issue engagement

# Observation grid (dyad x day x issue); the issue column gets the name that
# the classified documents use so that both can be joined
df_tmp <- rename(df_dyads_pr, topic_label = issue)
df_info <- data_pr

### source attention
# Number of press releases the source party published on the issue that day.
# Grid rows without a matching document have a missing `text` and count as 0.
df_source <- df_tmp %>%
  left_join(rename(df_info, source = party),
            by = c("country", "source", "date", "topic_label")) %>%
  mutate(n = ifelse(!is.na(text), 1, 0)) %>%
  group_by(country, date, party, source,
           party_dyad, party_dyad_sorted, topic_label) %>%
  summarise(attention_source = sum(n))

### party attention at t+0, t+1, t+2 and t+3
# Same count for the party side of the dyad on the day itself
df_party_date_plus0 <- df_tmp %>%
  left_join(df_info,
            by = c("country", "party", "date", "topic_label")) %>%
  mutate(n = ifelse(!is.na(text), 1, 0)) %>%
  group_by(country, date, party, source,
           party_dyad, party_dyad_sorted, topic_label) %>%
  summarise(attention_party_date_plus0 = sum(n))

# Shift the daily counts k days back: after joining on `date`, a row then
# carries the attention the party paid k days later. `date` is a grouping
# column, so the copies are ungrouped before it is changed.
df_party_date_plus1 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus1 = attention_party_date_plus0) %>%
  mutate(date = date - 1)

df_party_date_plus2 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus2 = attention_party_date_plus0) %>%
  mutate(date = date - 2)

df_party_date_plus3 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus3 = attention_party_date_plus0) %>%
  mutate(date = date - 3)

# Attach the lead columns (joined on all shared key columns) and accumulate
# them: tplus<k> is the party's total attention from day t up to day t+k.
# Rows whose lead day is not part of the grid get NA.
df_party <- df_party_date_plus0 %>%
  left_join(df_party_date_plus1) %>%
  left_join(df_party_date_plus2) %>%
  left_join(df_party_date_plus3) %>%
  mutate(
    attention_party_tplus0 = attention_party_date_plus0,
    attention_party_tplus1 = attention_party_tplus0 + attention_party_date_plus1,
    attention_party_tplus2 = attention_party_tplus1 + attention_party_date_plus2,
    attention_party_tplus3 = attention_party_tplus2 + attention_party_date_plus3
  )

# Combine source and party attention, tag the document type and put `type`
# directly after the dyad identifiers (17 columns in total)
df_res_pr <- left_join(df_source, df_party) %>%
  mutate(type = "Press release") %>%
  select(country, date, party, source, party_dyad, party_dyad_sorted,
         type, everything())

## check df_res
# Missing values are expected in the lead columns (date_plus1-3, tplus1-3):
# a row gets NA when the day k days ahead has no row in the grid, e.g. on the
# last days of the observation period.
any(is.na(df_res_pr))
# Ranges and NA counts of every column
summary(df_res_pr)
# Distribution of the daily document counts of the source party ...
table(df_res_pr$attention_source)
# ... and of the cumulative counts of the party from day t up to day t+k
table(df_res_pr$attention_party_tplus0)
table(df_res_pr$attention_party_tplus1)
table(df_res_pr$attention_party_tplus2)
table(df_res_pr$attention_party_tplus3)

#---------------------------------------------------------------------------------------------------------------------

# Twitter

## filter for tweets
data_tw <- filter(data, type == "Tweet")

## party dyads per day --> check existence of parties (e.g. PILZ/JETZT, BDP/PBD)

# One row per country-party combination found in the classified documents.
# Note that the grid is derived from `data` (press releases and tweets
# together), not from data_tw alone. Computed once instead of on every
# iteration.
party_list <- unique(data[, c("country", "party")])

# Every day of the observation period
dyad_days <- seq.Date(from = as.Date("2019-01-01"),
                      to = as.Date("2021-09-26"),
                      by = "day")

# For each party, build the daily grid of directed dyads (party <- source)
# with every other party of the same country.
# The iteration runs over the rows of party_list; counting distinct party
# labels instead would skip combinations whenever a label occurs in more than
# one country.
dyads_by_party <- lapply(seq_len(nrow(party_list)), function(i) {
  tmp_party <- as.character(party_list[["party"]][i])
  tmp_country <- as.character(party_list[["country"]][i])

  other_parties <- filter(data,
                          party != tmp_party &
                            country == tmp_country) %>%
    select(party) %>%
    unique()

  # A party without any counterpart in its country forms no dyad
  if (nrow(other_parties) == 0) {
    return(NULL)
  }

  tmp_res <- data.frame(country = tmp_country,
                        date = dyad_days,
                        party = tmp_party) %>%
    right_join(data.frame(party = tmp_party, source = other_parties$party),
               by = "party")

  # Directed label ("party-source") and an undirected label in which the two
  # party names are put in sorted order
  tmp_res$party_dyad <- paste0(tmp_res$party, "-", tmp_res$source)
  sorted_pairs <- lapply(str_split(tmp_res$party_dyad, pattern = "-"), sort)
  tmp_res$party_dyad_sorted <- vapply(
    sorted_pairs,
    function(pair) paste0(pair[1], "-", pair[2]),
    character(1)
  )

  tmp_res
})

# Stack the per-party grids in one step (NULL entries are ignored)
df_dyads_tw <- bind_rows(dyads_by_party)

## party dyads per day and issue
# Repeat the complete dyad-day grid once per issue: the grid is stacked
# length(issue_labels) times and each copy is labelled with one issue.
# The repetition is written out explicitly instead of relying on cbind()
# recycling the data frame against a longer vector.
issue_labels <- sort(unique(data$topic_label))
n_dyad_days <- nrow(df_dyads_tw)
df_dyads_tw <- df_dyads_tw[rep(seq_len(n_dyad_days),
                               times = length(issue_labels)), ]
df_dyads_tw$issue <- rep(issue_labels, each = n_dyad_days)
rownames(df_dyads_tw) <- NULL

## filter Grüne/GA, PILZ/JETZT, Mitte, BDP/PBD and CVP/PDC
## (only partial representation in parliament)

# TRUE for every actor/day combination that has to be removed from the grid
outside_parliament <- function(actor, day) {
  (actor == "Grüne/GA" & day <= as.Date("2019-09-29")) |
    (actor == "PILZ/JETZT" & day > as.Date("2019-09-29")) |
    (actor == "Mitte" & day < as.Date("2021-01-01")) |
    (actor == "BDP/PBD" & day >= as.Date("2021-01-01")) |
    (actor == "CVP/PDC" & day >= as.Date("2021-01-01"))
}

# A dyad-day is dropped when either side of the dyad is affected
drop_rows <- outside_parliament(df_dyads_tw$party, df_dyads_tw$date) |
  outside_parliament(df_dyads_tw$source, df_dyads_tw$date)

# Logical subsetting keeps all rows when nothing matches; negative indexing
# with an empty index vector would return zero rows instead.
df_dyads_tw <- df_dyads_tw[!(drop_rows %in% TRUE), ]

## calculate issue engagement

# Observation grid (dyad x day x issue); the issue column gets the name that
# the classified documents use so that both can be joined
df_tmp <- rename(df_dyads_tw, topic_label = issue)
df_info <- data_tw

### source attention
# Number of tweets the source party published on the issue that day.
# Grid rows without a matching document have a missing `text` and count as 0.
df_source <- df_tmp %>%
  left_join(rename(df_info, source = party),
            by = c("country", "source", "date", "topic_label")) %>%
  mutate(n = ifelse(!is.na(text), 1, 0)) %>%
  group_by(country, date, party, source,
           party_dyad, party_dyad_sorted, topic_label) %>%
  summarise(attention_source = sum(n))

### party attention at t+0, t+1, t+2 and t+3
# Same count for the party side of the dyad on the day itself
df_party_date_plus0 <- df_tmp %>%
  left_join(df_info,
            by = c("country", "party", "date", "topic_label")) %>%
  mutate(n = ifelse(!is.na(text), 1, 0)) %>%
  group_by(country, date, party, source,
           party_dyad, party_dyad_sorted, topic_label) %>%
  summarise(attention_party_date_plus0 = sum(n))

# Shift the daily counts k days back: after joining on `date`, a row then
# carries the attention the party paid k days later. `date` is a grouping
# column, so the copies are ungrouped before it is changed.
df_party_date_plus1 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus1 = attention_party_date_plus0) %>%
  mutate(date = date - 1)

df_party_date_plus2 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus2 = attention_party_date_plus0) %>%
  mutate(date = date - 2)

df_party_date_plus3 <- df_party_date_plus0 %>%
  ungroup() %>%
  rename(attention_party_date_plus3 = attention_party_date_plus0) %>%
  mutate(date = date - 3)

# Attach the lead columns (joined on all shared key columns) and accumulate
# them: tplus<k> is the party's total attention from day t up to day t+k.
# Rows whose lead day is not part of the grid get NA.
df_party <- df_party_date_plus0 %>%
  left_join(df_party_date_plus1) %>%
  left_join(df_party_date_plus2) %>%
  left_join(df_party_date_plus3) %>%
  mutate(
    attention_party_tplus0 = attention_party_date_plus0,
    attention_party_tplus1 = attention_party_tplus0 + attention_party_date_plus1,
    attention_party_tplus2 = attention_party_tplus1 + attention_party_date_plus2,
    attention_party_tplus3 = attention_party_tplus2 + attention_party_date_plus3
  )

# Combine source and party attention, tag the document type and put `type`
# directly after the dyad identifiers (17 columns in total)
df_res_tw <- left_join(df_source, df_party) %>%
  mutate(type = "Tweet") %>%
  select(country, date, party, source, party_dyad, party_dyad_sorted,
         type, everything())

## check df_res
# Missing values are expected in the lead columns (date_plus1-3, tplus1-3):
# a row gets NA when the day k days ahead has no row in the grid, e.g. on the
# last days of the observation period.
any(is.na(df_res_tw))
# Ranges and NA counts of every column
summary(df_res_tw)
# Distribution of the daily document counts of the source party ...
table(df_res_tw$attention_source)
# ... and of the cumulative counts of the party from day t up to day t+k
table(df_res_tw$attention_party_tplus0)
table(df_res_tw$attention_party_tplus1)
table(df_res_tw$attention_party_tplus2)
table(df_res_tw$attention_party_tplus3)

#---------------------------------------------------------------------------------------------------------------------

# Main data
# Stack the press-release and the tweet observations into one data set. Both
# tables were given the same 17 columns in the same order above; the `type`
# column tells the two sources apart.
df_res <- rbind(df_res_pr, df_res_tw)

#---------------------------------------------------------------------------------------------------------------------

# Government/opposition data - Party and source

# Returns 1 when `actor` is flagged as a government party on `day`, else 0.
# The same rules are applied to the party and to the source side of a dyad.
government_flag <- function(actor, day) {
  in_government <-
    ## Austria: ÖVP and FPÖ/VDU before 2019-06-03,
    ## ÖVP and Grüne/GA from 2020-01-07 onwards
    (actor %in% c("ÖVP", "FPÖ/VDU") & day < as.Date("2019-06-03")) |
    (actor %in% c("ÖVP", "Grüne/GA") & day >= as.Date("2020-01-07")) |
    ## Germany: CDU/CSU and SPD during the whole period
    actor %in% c("CDU/CSU", "SPD") |
    ## Switzerland
    ### Government is based on consociationalism and the four biggest parties
    ### automatically form the government. Therefore, no classic
    ### government-opposition divide can be observed.
    actor %in% c("CVP/PDC", "Mitte", "FDP/PLR", "SPS/PSS", "SVP/UDC")

  # Undetermined cases (missing values) count as "not in government"
  as.numeric(in_government %in% TRUE)
}

df_res$party_government <- government_flag(df_res$party, df_res$date)
df_res$source_government <- government_flag(df_res$source, df_res$date)

## check data
table(df_res$party_government, df_res$party)
table(df_res$source_government, df_res$source)

#---------------------------------------------------------------------------------------------------------------------

# Party-level data (MARPOR data)

## download and prepare MARPOR data
# Download the Manifesto Project main data set and keep the elections held
# since 2015 in the three countries under study.
# The argument is spelled out as `apikey`; the abbreviation `api` only worked
# through partial argument matching.
mp_data <- mp_maindataset(apikey = "") %>% # provide Manifesto Project API key
  filter(countryname %in% c("Austria", "Germany", "Switzerland")) %>%
  filter(edate >= as.Date("2015-01-01"))

# Replace the country names by the two-letter codes used in the text data
country_codes <- c(Austria = "AT", Germany = "DE", Switzerland = "CH")
mp_data$countryname <- unname(country_codes[as.character(mp_data$countryname)])
mp_data$country <- mp_data$countryname

# Harmonise the MARPOR party abbreviations with the party labels of the text
# data. The replacements are literal (no regular expressions) and are applied
# in the listed order.
abbrev_recodes <- c("FPÖ" = "FPÖ/VDU",
                    "90/Greens" = "Bündnis90/Die Grünen",
                    "GRÜNE" = "Grüne/GA",
                    "LINKE" = "Linke/PDS",
                    "PILZ" = "PILZ/JETZT")
for (old_abbrev in names(abbrev_recodes)) {
  mp_data$partyabbrev <- gsub(old_abbrev, abbrev_recodes[[old_abbrev]],
                              mp_data$partyabbrev, fixed = TRUE)
}

mp_data$party <- mp_data$partyabbrev

## data for "Mitte" --> CVP/PDC and BDP/PBD since 2021
## pervote and absseats --> sum of CVP/PDC and BDP/PBD
## rile -> CVP/PDC as proxy

# The 2019 CVP/PDC row serves as template for the "Mitte" row; vote share and
# seats of the 2019 BDP/PBD row are added to it
bdp_2019 <- mp_data[which(mp_data$party == "BDP/PBD" &
                            mp_data$edate == as.Date("2019-10-20")), ]

mp_data_ch_mitte <- filter(mp_data, party == "CVP/PDC", edate == "2019-10-20")
mp_data_ch_mitte$party <- "Mitte"
mp_data_ch_mitte$partyname <- "Mitte"
mp_data_ch_mitte$partyabbrev <- "Mitte"
mp_data_ch_mitte$pervote <- mp_data_ch_mitte$pervote + bdp_2019$pervote
mp_data_ch_mitte$absseat <- mp_data_ch_mitte$absseat + bdp_2019$absseat
mp_data <- rbind(mp_data, mp_data_ch_mitte)

## relevant manifesto
# Date of the election whose manifesto applies to an observation. The cut-off
# days themselves still belong to the earlier election; observations that
# match none of the rules keep the placeholder 1970-01-01.
df_res$edate <- case_when(
  df_res$country == "AT" & df_res$date <= as.Date("2019-09-29") ~
    as.Date("2017-10-15"),
  df_res$country == "AT" & df_res$date > as.Date("2019-09-29") ~
    as.Date("2019-09-29"),
  df_res$country == "DE" ~
    as.Date("2017-09-24"),
  df_res$country == "CH" & df_res$date <= as.Date("2019-10-20") ~
    as.Date("2015-10-18"),
  df_res$country == "CH" & df_res$date > as.Date("2019-10-20") ~
    as.Date("2019-10-20"),
  TRUE ~
    as.Date("1970-01-01")
)

# The minimum must not be 1970-01-01, otherwise some rows matched no rule
summary(df_res$edate)

## party election results, party seats and rile
# Vote share, seats, parliament size and left-right position (rile) of the
# party side of the dyad. Columns are renamed by name, not by position.
mp_data_party <- mp_data %>%
  select(country, edate, party,
         party_pervote = pervote,
         party_absseat = absseat,
         totseats,
         party_rile = rile)
df_res <- left_join(df_res, mp_data_party,
                    by = c("country", "party", "edate"))

## source election results, party seats and rile
# The source lookup is keyed on country, source and election date only.
# `totseats` is not used as a key: it was just filled in by the party join
# above, so using it would leave the source columns empty whenever the party
# side found no match.
mp_data_source <- mp_data %>%
  select(country, edate,
         source = party,
         source_pervote = pervote,
         source_absseat = absseat,
         source_rile = rile)
df_res <- left_join(df_res, mp_data_source,
                    by = c("country", "source", "edate"))

# Seat shares in percent and absolute left-right distance within the dyad
df_res$party_perseat <- round((df_res$party_absseat / df_res$totseats) * 100, digits = 3)
df_res$source_perseat <- round((df_res$source_absseat / df_res$totseats) * 100, digits = 3)
df_res$rile_difference <- abs(df_res$party_rile - df_res$source_rile)

# NA in the MARPOR columns points to party labels without a MARPOR match
any(is.na(df_res))
summary(df_res)

#---------------------------------------------------------------------------------------------------------------------

# Election campaign period (6 weeks ahead of election)

# Election day per country and length of the campaign window in days
election_countries <- c("AT", "CH", "DE")
election_days <- as.Date(c("2019-09-29", "2019-10-20", "2021-09-26"))
campaign_days <- 6 * 7

# Election day that belongs to the country of each observation
# (NA for any other country)
obs_election_day <- election_days[match(df_res$country, election_countries)]

# 1 from six weeks before the election up to and including election day
in_campaign <- df_res$date <= obs_election_day &
  df_res$date >= obs_election_day - campaign_days
df_res$election_campaign <- as.numeric(in_campaign %in% TRUE)

table(df_res$election_campaign)

#---------------------------------------------------------------------------------------------------------------------

# Referendum campaign period (6 weeks ahead of referendum)

# Referendum days (applied to Swiss observations only) and length of the
# campaign window in days
referendum_days <- as.Date(c("2019-02-10", "2019-05-19",
                             "2020-02-09", "2020-09-27", "2020-11-29",
                             "2021-03-07", "2021-06-13", "2021-09-26"))
campaign_days <- 6 * 7

# TRUE when the observation day lies within six weeks before (or on) at least
# one referendum day
in_any_window <- rep(FALSE, nrow(df_res))
for (k in seq_along(referendum_days)) {
  referendum_day <- referendum_days[k]
  in_any_window <- in_any_window |
    (df_res$date <= referendum_day &
       df_res$date >= referendum_day - campaign_days)
}

in_campaign <- df_res$country == "CH" & in_any_window
df_res$referendum_campaign <- as.numeric(in_campaign %in% TRUE)

table(df_res$referendum_campaign)

#---------------------------------------------------------------------------------------------------------------------

# Calculate issue engagement variable

# Did the source address the issue on day t at all?
source_active <- df_res$attention_source > 0

# 1 when the source was active on day t and the party addressed the same
# issue within the given horizon (cumulative attention > 0), otherwise 0
engagement_flag <- function(cumulative_attention) {
  ifelse(source_active & cumulative_attention > 0, 1, 0)
}

# Set the flag to NA wherever the attention on the last day of the horizon is
# unknown, i.e. the horizon is not fully covered by the data
blank_incomplete_horizon <- function(flag, last_day_attention) {
  flag[is.na(last_day_attention) & !is.na(flag)] <- NA
  flag
}

df_res$issue_engagement_tplus0 <- engagement_flag(df_res$attention_party_tplus0)

df_res$issue_engagement_tplus1 <- blank_incomplete_horizon(
  engagement_flag(df_res$attention_party_tplus1),
  df_res$attention_party_date_plus1)

df_res$issue_engagement_tplus2 <- blank_incomplete_horizon(
  engagement_flag(df_res$attention_party_tplus2),
  df_res$attention_party_date_plus2)

df_res$issue_engagement_tplus3 <- blank_incomplete_horizon(
  engagement_flag(df_res$attention_party_tplus3),
  df_res$attention_party_date_plus3)

summary(df_res$issue_engagement_tplus0)
summary(df_res$issue_engagement_tplus1)
summary(df_res$issue_engagement_tplus2)
summary(df_res$issue_engagement_tplus3)

#---------------------------------------------------------------------------------------------------------------------

# Finalise data set

# Final column order, selected by name instead of by position so that a
# missing or renamed column raises an error rather than silently selecting the
# wrong variable. `totseats` is only needed for the seat shares and is dropped.
final_columns <- c(
  # dyad identifiers
  "country", "date", "party", "source", "party_dyad", "party_dyad_sorted",
  # context
  "edate", "election_campaign", "referendum_campaign",
  # party characteristics
  "party_pervote", "party_absseat", "party_perseat",
  "party_government", "party_rile",
  # source characteristics
  "source_pervote", "source_absseat", "source_perseat",
  "source_government", "source_rile",
  # dyad characteristics
  "rile_difference",
  # document type, issue and attention measures
  "type", "topic_label", "attention_source",
  "attention_party_date_plus0", "attention_party_date_plus1",
  "attention_party_date_plus2", "attention_party_date_plus3",
  "attention_party_tplus0", "attention_party_tplus1",
  "attention_party_tplus2", "attention_party_tplus3",
  # dependent variables
  "issue_engagement_tplus0", "issue_engagement_tplus1",
  "issue_engagement_tplus2", "issue_engagement_tplus3"
)
df_res <- df_res[, final_columns]

# `edate` holds the date of the election whose manifesto applies
df_res <- rename(df_res, last_election = edate)

#---------------------------------------------------------------------------------------------------------------------

# Filter to remove irrelevant observations

# Tweet observations are removed for every dyad that involves FPÖ/VDU or
# PILZ/JETZT, on either side of the dyad. Press-release observations of these
# parties are kept.
excluded_tweet_parties <- c("FPÖ/VDU", "PILZ/JETZT")

drop_rows <- df_res$type == "Tweet" &
  (df_res$party %in% excluded_tweet_parties |
     df_res$source %in% excluded_tweet_parties)

# A logical mask is used on purpose: df_res[-idx, ] with an empty index vector
# would discard every row instead of none.
df_res <- df_res[!(drop_rows %in% TRUE), ]

#---------------------------------------------------------------------------------------------------------------------

# Sanity checks

# Missing values are expected at least in the lead attention columns and in
# the issue-engagement variables derived from them (horizons that reach beyond
# the days covered by the grid)
any(is.na(df_res))

# Ranges and NA counts of every column of the final data set
summary(df_res)

# Each party label should appear under exactly one country
table(df_res$party, df_res$country)

# Number of observations per issue
table(df_res$topic_label)

#---------------------------------------------------------------------------------------------------------------------

# Save data set

# Write the final data set to the working directory
saveRDS(object = df_res, file = "Main_Data.RDS")

# Total run time of the script
difftime(Sys.time(), start)

#---------------------------------------------------------------------------------------------------------------------
